library(pacman)
p_load(stringr,
       tidyverse)

##### Helpers ----

# Append a period to every message whose last character is neither
# punctuation nor a space, so that each cleaned message ends in a terminator.
# NA messages are returned unchanged (still NA).
add_final_period <- function(text) {
  last_char <- substr(text, nchar(text), nchar(text))
  has_terminator <- grepl("[[:punct:]]", last_char) | last_char == " "
  needs_period <- !is.na(has_terminator) & !has_terminator
  text[needs_period] <- paste0(text[needs_period], ".")
  text
}

# Replace every whole-word occurrence of a first name with "[NAME]".
#
# text:        character vector of (lower-cased) messages.
# first_names: character vector of (lower-cased) first names to blank out.
#
# A name is replaced when it
#   * starts at the beginning of the text or right after whitespace, and
#   * is followed by the end of the text, by any character that is neither a
#     word character nor an apostrophe, or by a possessive "'s".
# This covers the contexts that were previously handled one regex at a time
# (" name ", "name.", "name?", "name!", "name,", "name's", "not name",
# name at start / end of text). Both edges are zero-width look-arounds, so:
#   * neighbouring names ("john mary bob") are all replaced, because no match
#     consumes the space the next name needs in front of it;
#   * longer words that merely start with a name ("not johnson") and
#     contractions ("don't") are left alone;
#   * the surrounding spacing and punctuation are kept exactly as written.
blank_first_names <- function(text, first_names) {
  # NA or empty entries would otherwise turn into bogus alternatives.
  first_names <- unique(first_names[!is.na(first_names) & nzchar(first_names)])
  if (length(first_names) == 0L) {
    return(text)
  }

  # \Q...\E makes the regex engine treat each name literally, so names that
  # contain characters such as "." are not interpreted as regex syntax.
  quoted_names <- paste0("\\Q", first_names, "\\E")
  name_pattern <- paste0(
    "(?<!\\S)(?:", paste(quoted_names, collapse = "|"), ")",
    "(?=$|[^\\w']|'s(?!\\w))"
  )
  str_replace_all(text, name_pattern, "[NAME]")
}

# Full cleaning pipeline for one vector of messages: make sure the message
# ends in a terminator, blank out first names, then blank out every run of
# digits.
clean_messages <- function(text, first_names) {
  text <- add_final_period(text)
  text <- blank_first_names(text, first_names)
  gsub("[[:digit:]]+", "[number blanked]", text)
}


##### Names ----
# Read once and reused for both experiments.
namedf <- read.csv("04-Perspective-API/data/03-cleaning-datasets/02-first-names.csv", stringsAsFactors = FALSE)
common.words <- read.delim("04-Perspective-API/data/03-cleaning-datasets/01-common-words.txt", stringsAsFactors = FALSE, header = FALSE)
namedf$firstname <- tolower(namedf$firstname)

# Get rid of names that are also common words.
namedf <- anti_join(x = namedf, y = common.words,
                    by = c("firstname" = "V1"))


##### Messages - FIRST (experiment 1) ----
messages1 <- read.csv("04-Perspective-API/data/01-post-experiment/01-experiment-1-messages.csv", stringsAsFactors = FALSE)
messages1$message_body <- tolower(messages1$message_body)
messages1$message_body <- gsub(",", "", messages1$message_body)
messages1 <- messages1[1:2]

messages1$cleaned <- clean_messages(messages1$message_body, namedf$firstname)

head(messages1$cleaned, 100)
head(messages1$message_body, 50)

# Select the id column by position. Its imported header was previously
# spelled out as the literal "?..conversation_id", which read.csv can never
# produce (headers are sanitised by make.names), so the selection failed.
# NOTE(review): the header is presumably mangled by a byte-order mark in the
# CSV -- confirm, and consider reading with fileEncoding = "UTF-8-BOM".
messages1clean <- messages1[c(names(messages1)[1], "cleaned")]


##### Messages - SECOND (experiment 2) ----
messages2 <- read.csv("04-Perspective-API/data/01-post-experiment/02-experiment-2-messages.csv", stringsAsFactors = FALSE)
messages2$message_body <- tolower(messages2$comment)
messages2$message_body <- gsub(",", "", messages2$message_body)
# Select by name rather than by position so the step does not depend on how
# many columns the input file happens to have.
messages2 <- messages2[c("message_id", "message_body")]

messages2$cleaned <- clean_messages(messages2$message_body, namedf$firstname)

head(messages2$cleaned, 300)
head(messages2$message_body, 50)

messages2clean <- messages2[c("message_id", "cleaned")]
# Give both experiments the same column names so they can be stacked.
names(messages2clean) <- names(messages1clean)


##### Combine and save ----
full_messages <- rbind(messages1clean, messages2clean)

write.csv(full_messages, "04-Perspective-API/data/02-clean-data/clean_data.csv", row.names = FALSE)
